https://satijalab.org/seurat/articles/pbmc3k_tutorial.html

library(readr)
EZ_Batch1 <- read_csv("EZ_Batch1.csv")
## New names:
## * `` -> ...1
## Rows: 18438 Columns: 31734
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr     (1): ...1
## dbl (31733): EZ.Batch1_HipR-AD-G3-2w_GTACTTTCACATGGGA, EZ.Batch1_HipR-AD-G3-...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
EZ_Batch1<-as.data.frame(EZ_Batch1)
head(EZ_Batch1)
library(Seurat)
## Attaching SeuratObject
colnames(EZ_Batch1)[1] <- "gene"
rownames(EZ_Batch1) <- EZ_Batch1$gene
EZ_Batch1$gene <- NULL
head(EZ_Batch1)
library(data.table)


# transpose
tez <- transpose(EZ_Batch1)

# get row and colnames in order
colnames(tez) <- rownames(EZ_Batch1)
rownames(tez) <- colnames(EZ_Batch1)
obj <- CreateSeuratObject(counts=EZ_Batch1, project="EZ_Batch1", min.cells = 3, min.features = 200)
obj
## An object of class Seurat 
## 17448 features across 31733 samples within 1 assay 
## Active assay: RNA (17448 features, 0 variable features)
obj[["percent.mt"]] <- PercentageFeatureSet(obj, pattern = "^MT-")
VlnPlot(obj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
## Warning in SingleExIPlot(type = type, data = data[, x, drop = FALSE], idents =
## idents, : All cells have the same value of percent.mt.

plot1 <- FeatureScatter(obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
## Warning in cor(x = data[, 1], y = data[, 2]): the standard deviation is zero
plot2 <- FeatureScatter(obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1

plot2

obj <- subset(obj, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
obj <- NormalizeData(obj, normalization.method = "LogNormalize", scale.factor = 10000)
obj <- FindVariableFeatures(obj, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(obj), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(obj)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot1

plot2

all.genes <- rownames(obj)
obj <- ScaleData(obj, features = all.genes)
## Centering and scaling data matrix
obj <- RunPCA(obj, features = VariableFeatures(object = obj))
## PC_ 1 
## Positive:  Meg3, Nrg3, Tenm2, Opcml, Snhg11, Epha6, Kcnip4, Nrxn3, Gria1, Cntnap2 
##     4930419G24Rik, Frmpd4, Nav3, Ptprd, Cacnb2, Phactr1, Xkr4, Kcnq5, Fam19a1, Msra 
##     Slc8a1, Galntl6, Arpp21, Hs6st3, Chrm3, Hcn1, Ryr3, Kcnh7, 9530059O14Rik, Stxbp5l 
## Negative:  Apoe, Slc1a3, Slc1a2, Plpp3, Atp1a2, Cst3, Gpc5, Clu, Cd81, Sparcl1 
##     Gja1, Ndrg2, F3, Wdr17, Npas3, Mt2, Rorb, Phkg1, Ntsr2, Pla2g7 
##     Rgs20, Prex2, Mt1, Ptprz1, Hepacam, Gpr37l1, Id3, Glul, Ptn, Bcan 
## PC_ 2 
## Positive:  Slc1a3, Gpc5, Slc1a2, Plpp3, Wdr17, Apoe, Clu, F3, Ptprz1, Rorb 
##     Npas3, Bcan, Phkg1, Ndrg2, Ntsr2, Hepacam, Rgs20, Pla2g7, Slc4a4, Ntm 
##     Gpr37l1, Slc7a10, Gja1, Cst3, Aldoc, Htra1, Id4, Cldn10, Mlc1, Bmpr1b 
## Negative:  Slco1a4, Flt1, Cldn5, Rgs5, Klf2, Adgrl4, Itm2a, Ly6c1, Ly6a, Ebf1 
##     Pglyrp1, Igfbp7, Esam, Eng, Ctla2a, Nostrin, Ptprb, Hspb1, Abcb1a, Mecom 
##     Slc7a5, Lef1, Cyyr1, Slc22a8, Slc2a1, Acvrl1, H2-D1, Cxcl12, Pltp, Emcn 
## PC_ 3 
## Positive:  Ntm, Hcn1, Galntl6, Etl4, Cntnap2, Pde4d, Agbl4, Tenm3, Grip1, Snhg11 
##     Slc8a1, Chrm3, Lrrc4c, Sox5, Gabrg3, Npas3, Pid1, Dpp10, Mgat4c, Slc35f1 
##     Ptprt, Ldb2, Nrxn3, Dmd, Kcnh7, Sgcz, Thrb, Gria4, Tenm4, Cit 
## Negative:  Dock10, Stxbp6, Cdh9, Ahcyl2, Trpm3, Gm28376, Adarb2, Lrrtm4, Lingo2, Tmem108 
##     Kcnip4, Frmd4b, Rfx3, Synpr, Epha7, Fam19a2, Sema5a, Dgkh, Ptchd4, Plcl1 
##     Il1rap, C1ql3, Prox1, Trpc6, Plk5, Ano3, Mctp1, Fam19a1, Gm20754, Gm42864 
## PC_ 4 
## Positive:  Fam19a1, Cacnb2, Ryr3, Dock4, Hs6st3, Khdrbs3, Pde1a, Chrm3, Iqgap2, Epha6 
##     Sema3e, Dmd, Pkp2, Rnf182, Msra, 4930419G24Rik, Kcnip4, Man1a, Arl15, Gm4128 
##     Brd9, Shisa6, 9530059O14Rik, Hs3st4, Sv2b, Prkca, Mdga1, Pex5l, Neurod6, Slc8a1 
## Negative:  Erbb4, Nxph1, Kcnmb2, Sgcz, Grik1, Dlx6os1, Col19a1, Cit, Rbms3, Dpp10 
##     4930555F03Rik, Grip1, Gad2, Gm38505, Gria4, Lrrtm4, Kcnip1, Zfp536, Cntnap4, Frmd5 
##     Adarb2, Coro6, Unc5d, Alk, Cntn5, Btbd11, Ptprm, Gm13629, Maf, Mef2c 
## PC_ 5 
## Positive:  Ntm, Gpc5, Slc1a2, Wdr17, Bcan, Luzp2, Plpp3, Ntsr2, F3, Atp1a2 
##     Cdh10, Slc7a10, Chrm3, Galntl6, Ptn, Mertk, Slco1c1, Rgs20, Rorb, Cntnap2 
##     Htra1, Glul, Cldn10, Cst3, Kcnh7, Mfge8, Kcnn2, Ndrg2, Grm3, Slc1a3 
## Negative:  1700007G11Rik, Dnah12, Odf3b, Gm867, Ak7, Cfap43, Mapk15, Gm13481, Spag16, Tmem212 
##     Spag17, Wdr49, Hydin, Ankfn1, Ankub1, Gm10714, Dnah6, Spef2, Lrrc36, Ccdc153 
##     Lrrc43, Ccdc170, Cdhr3, Ak9, Dnaaf3, Dnah11, Enkur, Vwa3a, Rarres2, FRMPD2
print(obj[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  Meg3, Nrg3, Tenm2, Opcml, Snhg11 
## Negative:  Apoe, Slc1a3, Slc1a2, Plpp3, Atp1a2 
## PC_ 2 
## Positive:  Slc1a3, Gpc5, Slc1a2, Plpp3, Wdr17 
## Negative:  Slco1a4, Flt1, Cldn5, Rgs5, Klf2 
## PC_ 3 
## Positive:  Ntm, Hcn1, Galntl6, Etl4, Cntnap2 
## Negative:  Dock10, Stxbp6, Cdh9, Ahcyl2, Trpm3 
## PC_ 4 
## Positive:  Fam19a1, Cacnb2, Ryr3, Dock4, Hs6st3 
## Negative:  Erbb4, Nxph1, Kcnmb2, Sgcz, Grik1 
## PC_ 5 
## Positive:  Ntm, Gpc5, Slc1a2, Wdr17, Bcan 
## Negative:  1700007G11Rik, Dnah12, Odf3b, Gm867, Ak7
DimHeatmap(obj, dims = 1, cells = 500, balanced = TRUE)

ElbowPlot(obj)

obj <- FindNeighbors(obj, dims = 1:20)
## Computing nearest neighbor graph
## Computing SNN
obj <- FindClusters(obj, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 31685
## Number of edges: 1097192
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9394
## Number of communities: 22
## Elapsed time: 10 seconds
head(Idents(obj), 5)
## EZ.Batch1_HipR-AD-G3-2w_GTACTTTCACATGGGA 
##                                       13 
## EZ.Batch1_HipR-AD-G3-2w_TGCGGGTAGTGAATTG 
##                                       13 
## EZ.Batch1_HipR-WT-G3-2w_ACACCAAAGAGAACAG 
##                                       13 
## EZ.Batch1_HipR-WT-G3-2w_ACACCCTAGGCGCTCT 
##                                       13 
## EZ.Batch1_HipR-WT-G3-2w_CAACTAGAGTCCCACG 
##                                       13 
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
obj <- RunUMAP(obj, dims = 1:20)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 14:59:27 UMAP embedding parameters a = 0.9922 b = 1.112
## 14:59:27 Read 31685 rows and found 20 numeric columns
## 14:59:27 Using Annoy for neighbor search, n_neighbors = 30
## 14:59:27 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 14:59:32 Writing NN index file to temp file D:\Users\YH881\AppData\Local\Temp\31\Rtmp63kEZc\file5cb431374489
## 14:59:36 Searching Annoy index using 1 thread, search_k = 3000
## 14:59:51 Annoy recall = 100%
## 14:59:52 Commencing smooth kNN distance calibration using 1 thread
## 14:59:55 Initializing from normalized Laplacian + noise
## 15:00:01 Commencing optimization for 200 epochs, with 1437414 positive edges
## 15:00:53 Optimization finished
DimPlot(obj, reduction = "umap", label=TRUE)

x_markers <- FindMarkers(obj, ident.1 = 1, min.pct = 0.25,  test.use = "wilcox")
## For a more efficient implementation of the Wilcoxon Rank Sum Test,
## (default method for FindMarkers) please install the limma package
## --------------------------------------------
## install.packages('BiocManager')
## BiocManager::install('limma')
## --------------------------------------------
## After installation of limma, Seurat will automatically use the more 
## efficient implementation (no further action necessary).
## This message will be shown once per session
# find all markers of each clusters
for (x in 1:21) {
  print(x)
  x_markers <- FindMarkers(obj, ident.1 = x, min.pct = 0.25,  test.use = "wilcox")
  write.csv(x_markers,sprintf("markers/c%s.csv",x)
)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21

Check out our analysis in the find_marker.ipynb. We found that cluster 17 is microglia

c17 <- subset(obj, idents = c("17"))
saveRDS(c17, file = "c17.rds")

```